# Import dependencies
from collections import deque
from datetime import datetime
from math import sqrt
from openpyxl import load_workbook
from plotly.subplots import make_subplots
from scipy.optimize import curve_fit
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from torch import nn, optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import torch
import torch.nn.functional as F
# Fix all random seeds so results are reproducible across runs
SEED = 1234
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed(SEED)
Даны обращения клиентов в банк за 2018-2019гг., необходимо построить прогноз на первое полугодие 2020г.
Для прогнозирования обращений используется нейронная сеть, которая выполняет one-step прогноз на основе следующих признаков:
- предшествующие значения обращений (количество равно лагу)
- день недели (0 - 6, где 0 - Понедельник)
- выходной/не выходной (0 или 1, где 1 - выходной)
# Load the 2018-2019 daily requests series: first sheet, date column as
# index, one value column, exactly 730 rows (two full years).
data = pd.read_excel(
    'Задание прогноз.xlsx',
    sheet_name=0,
    engine='openpyxl',
    index_col=0,
    usecols=[0, 1],
    nrows=730,
)
# Workbook headers carry stray whitespace — strip it from column names
data.rename(columns=str.strip, inplace=True)
data.info()
<class 'pandas.core.frame.DataFrame'> DatetimeIndex: 730 entries, 2018-01-01 to 2019-12-31 Data columns (total 1 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Обращения 730 non-null int64 dtypes: int64(1) memory usage: 11.4 KB
# Plot the raw series to eyeball weekly/holiday patterns before
# engineering calendar features
fig = go.Figure()
raw_trace = go.Scatter(
    x=data.index,
    y=data['Обращения'],
    name='Количество обращений клиентов',
    mode='lines',
    line=dict(color='blue'),
)
fig.add_trace(raw_trace)
fig.update_layout(title=dict(text="Количество обращений клиентов"))
fig.update_xaxes(title=dict(text='Дата'))
fig.update_yaxes(title=dict(text='Количество обращений'))
fig.show()
# Add the day of week as an additional feature:
# the figure above shows a strong weekly pattern in the series.
# Values are 0-6 with 0 = Monday (pandas convention).
data['День недели'] = data.index.dayofweek
# Add a public-holiday flag as an additional feature.
# Russian public holidays as "MM-DD" strings. NOTE: every entry needs its
# trailing comma — the original tuple was missing one after '05-10', which
# silently concatenated it with '06-12' into the bogus string '05-1006-12',
# so neither May 10 nor June 12 was ever flagged.
days_off = ('01-01', '01-02', '01-03', '01-04', '01-05', '01-06', '01-07',
            '02-23',
            '03-08',
            '05-01', '05-02', '05-03', '05-09', '05-10',
            '06-12',
            '11-04',
            '12-31')


def get_day_off(x):
    """Return 1 if the row's date is a public holiday, else 0.

    Parameters
    ----------
    x: pd.Series. A DataFrame row from ``apply(..., axis=1)``; ``x.name``
        is the row's DatetimeIndex label.
    """
    day_off = x.name.strftime("%m-%d")
    return 1 if day_off in days_off else 0
# Add the holiday flag column (row-wise apply over the date index)
data['Праздники'] = data.apply(get_day_off, axis=1)
data.head(10)
| Обращения | День недели | Праздники | |
|---|---|---|---|
| Дата | |||
| 2018-01-01 | 4177 | 0 | 1 |
| 2018-01-02 | 8375 | 1 | 1 |
| 2018-01-03 | 13324 | 2 | 1 |
| 2018-01-04 | 15566 | 3 | 1 |
| 2018-01-05 | 16257 | 4 | 1 |
| 2018-01-06 | 13784 | 5 | 1 |
| 2018-01-07 | 11406 | 6 | 1 |
| 2018-01-08 | 17867 | 0 | 0 |
| 2018-01-09 | 28380 | 1 | 0 |
| 2018-01-10 | 29324 | 2 | 0 |
LAG = 14  # size of sliding window, in days

# Split into training and validation parts. The validation slice starts
# LAG-1 days before the threshold so its first target still has a full
# window of history.
date_train_threshold = datetime(2019, 12, 17)
train_data = data.loc[:date_train_threshold].copy()
val_data = data.loc[date_train_threshold - pd.Timedelta(LAG - 1, 'd'):].copy()

# Scale the target to [0, 1]; the scaler is fit on training data only to
# avoid leaking validation statistics.
scaler = MinMaxScaler()
train_data_norm = train_data.copy()
train_data_norm['Обращения'] = scaler.fit_transform(train_data[['Обращения']])
val_data_norm = val_data.copy()
val_data_norm['Обращения'] = scaler.transform(val_data[['Обращения']])
# Helper function for creating dataset
def create_samples(data, lag=7, data_augmentation=False,
                   n_times=1, mu=0., sigma=0.1):
    """Create input/output samples with a sliding window.

    If X1, X2, X3, X4, X5, X6, X7, ..., Xn is the time series
    and 'lag' is 3, the samples will be:
          X             y
    [X1, X2, X3]    -> X4,
    [X2, X3, X4]    -> X5,
    ...
    [Xn-3, Xn-2, Xn-1] -> Xn

    Parameters
    ----------
    data: pd.DataFrame. Time series indexed by a contiguous daily
        DatetimeIndex, with columns 'Обращения', 'День недели', 'Праздники'.
    lag: int. Size of sliding window.
    data_augmentation: bool. If True, noisy copies of a random tail of the
        samples are appended to the initial data.
    n_times: int. How many times new data will be added to initial data.
    mu: float. Mean for the Gaussian noise used to create augmented data.
    sigma: float. Standard deviation for the Gaussian noise used to create
        augmented data.

    Returns
    -------
    Samples: tuple of np.ndarray. (X, day_of_week, day_off, y).
    """
    num_samples = len(data) - lag
    X = np.zeros((num_samples, lag))
    day_of_week = np.zeros(num_samples)
    day_off = np.zeros(num_samples)
    y = np.zeros(num_samples)
    for i, d in enumerate(data.index[:-lag]):
        # Window of `lag` consecutive days (label slicing is inclusive);
        # the target is the day immediately after the window.
        X[i, :] = data[d: d + pd.Timedelta(lag - 1, 'd')]['Обращения'].values
        day_of_week[i] = data.loc[d + pd.Timedelta(lag, 'd')]['День недели']
        day_off[i] = data.loc[d + pd.Timedelta(lag, 'd')]['Праздники']
        y[i] = data.loc[d + pd.Timedelta(lag, 'd')]['Обращения']
    # Append slightly perturbed copies of a random tail of the samples.
    # Calendar features are copied unchanged; targets get the mean noise of
    # their window added, clipped at zero like the inputs.
    if data_augmentation:
        for _ in range(n_times):
            # Guard the upper bound: with num_samples <= 10 the original
            # randint(0, num_samples - 10) had high <= low and raised.
            idx = np.random.randint(0, max(num_samples - 10, 1))
            noise_X = np.random.normal(mu, sigma, size=(len(X) - idx, lag))
            augment_X = np.clip(X[idx:] + noise_X, a_min=0, a_max=None)
            X = np.concatenate((X, augment_X), axis=0)
            day_of_week = np.concatenate((day_of_week, day_of_week[idx:]), axis=0)
            day_off = np.concatenate((day_off, day_off[idx:]), axis=0)
            augment_y = np.clip(y[idx:] + np.mean(noise_X, axis=1),
                                a_min=0, a_max=None)
            y = np.concatenate((y, augment_y), axis=0)
    return X, day_of_week, day_off, y
BATCH_SIZE = 20  # batch size for DataLoader

# Build (X, day_of_week, day_off, y) samples; only training data is augmented
train = create_samples(
    train_data_norm, LAG,
    data_augmentation=True, n_times=2,
    mu=0., sigma=0.005
)
val = create_samples(
    val_data_norm, LAG,
    data_augmentation=False
)


def _as_tensor_dataset(samples):
    """Wrap (X, day_of_week, day_off, y) arrays into a TensorDataset."""
    x, dow, off, target = samples
    return TensorDataset(
        torch.tensor(x, dtype=torch.float),
        torch.tensor(dow, dtype=torch.long),
        torch.tensor(off, dtype=torch.float),
        torch.tensor(target, dtype=torch.float),
    )


# Create Dataloaders for the neural network
train_ds = _as_tensor_dataset(train)
val_ds = _as_tensor_dataset(val)
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=True)
# Define neural network
class CNNRegression(nn.Module):
    """One-step CNN regressor over a lag window plus calendar features.

    forward() inputs:
      x: float tensor (batch, lag) — normalized lagged values.
      day_of_week: long tensor (batch,) — 0..6, passed through an embedding.
      day_off: float tensor (batch,) — holiday flag, passed through a small
        linear layer.
    Output: float tensor (batch, 1).
    """

    def __init__(self, lag):
        super(CNNRegression, self).__init__()
        self.lag = lag
        self.conv1 = nn.Conv1d(1, 16, kernel_size=7, padding=3)
        self.conv1_bn = nn.BatchNorm1d(16)
        self.conv2 = nn.Conv1d(16, 32, kernel_size=7, padding=3)
        self.pool = nn.MaxPool1d(2)
        self.embedding1 = nn.Embedding(7, 5, max_norm=1)
        self.linear1 = nn.Linear(1, 3)
        # Width of the concatenated feature vector: the convs keep length
        # (kernel 7 / padding 3), pooling halves it, so the conv branch
        # flattens to 32 * (lag // 2); plus 5 embedding dims and 3 day-off
        # features. (Was hard-coded to 232, which only worked for lag == 14.)
        self.linear2 = nn.Linear(32 * (lag // 2) + 5 + 3, 64)
        self.linear3 = nn.Linear(64, 16)
        self.linear4 = nn.Linear(16, 1)
        self.drop = nn.Dropout(p=0.1)

    def forward(self, x, day_of_week, day_off):
        x = x.view(-1, 1, self.lag)
        x = F.selu(self.conv1_bn(self.conv1(x)))
        x = self.pool(x)
        x = F.selu(self.conv2(x))
        x = torch.flatten(x, start_dim=1)
        day_of_week = self.embedding1(day_of_week)
        day_off = day_off.view(-1, 1)
        day_off = F.selu(self.linear1(day_off))
        combined = torch.cat(
            (x, day_of_week, day_off),
            dim=1
        )
        out = F.selu(self.linear2(combined))
        out = self.drop(out)
        out = F.selu(self.linear3(out))
        out = self.linear4(out)
        return out
def weights_init(m):
    """Initialize Linear/Conv1d layers in place.

    Weights are drawn uniformly from [-1/sqrt(fan), 1/sqrt(fan)] where fan
    is ``in_features`` (Linear) or ``in_channels`` (Conv1d); biases are set
    to zero. Intended for use with ``model.apply(weights_init)``.
    """
    if isinstance(m, nn.Linear):
        bound = 1. / np.sqrt(m.in_features)
        m.weight.data.uniform_(-bound, bound)
        m.bias.data.fill_(0.)
    elif isinstance(m, nn.Conv1d):
        bound = 1. / np.sqrt(m.in_channels)
        m.weight.data.uniform_(-bound, bound)
        if m.bias is not None:
            m.bias.data.fill_(0.)
# Helper function to train model
def training(model, criterion, optimizer, data_loader, device):
    """Run one training epoch over the dataset.

    Parameters
    ----------
    model: Neural network model.
    criterion: Optimization criterion (loss function).
    optimizer: Optimizer updating the model parameters.
    data_loader: Iterator which yields (x, day_of_week, day_off, target)
        batches.
    device: torch.device or str.

    Returns
    -------
    train_loss: float. Mean per-sample loss over the whole dataset.
    """
    model.train()
    train_loss = 0
    for x, day_of_week, day_off, targets in data_loader:
        optimizer.zero_grad()
        x = x.to(device)
        day_of_week = day_of_week.to(device)
        day_off = day_off.to(device)
        targets = targets.to(device)
        output = model(x, day_of_week, day_off)
        loss = criterion(torch.squeeze(output, dim=1), targets)
        # Weight by the batch size. The original used len(samples), which
        # is the length of the batch *tuple* (always 4 tensors), not the
        # number of samples, so the averaged loss was mis-scaled.
        train_loss += loss.item() * x.size(0)
        loss.backward()
        optimizer.step()
    return train_loss / len(data_loader.dataset)
# Helper function to validate model
def validating(model, criterion, data_loader, device):
    """Evaluate the model on a dataset without gradient updates.

    Puts the model in eval mode and disables autograd.

    Parameters
    ----------
    model: Neural network model.
    criterion: Optimization criterion (loss function).
    data_loader: Iterator which yields (x, day_of_week, day_off, target)
        batches.
    device: torch.device or str.

    Returns
    -------
    val_loss: float. Mean per-sample loss over the whole dataset.
    """
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for x, day_of_week, day_off, targets in data_loader:
            x = x.to(device)
            day_of_week = day_of_week.to(device)
            day_off = day_off.to(device)
            targets = targets.to(device)
            output = model(x, day_of_week, day_off)
            loss = criterion(torch.squeeze(output, dim=1), targets)
            # Weight by batch size; len(samples) was the tuple length (4),
            # not the number of samples — see training() for the same fix.
            val_loss += loss.item() * x.size(0)
    return val_loss / len(data_loader.dataset)
# Use the GPU when available; model and batches are moved to this device.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = CNNRegression(lag=LAG).to(device)
criterion = nn.MSELoss()
EPOCHS = 20 # number of training epochs
# Train and validate model: Adam with a slowly decaying learning rate
# (ExponentialLR multiplies the lr by 0.995 once per epoch).
optimizer = optim.Adam(model.parameters(), lr=1e-3)
scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.995)
for epoch in range(EPOCHS):
    train_loss = training(model, criterion, optimizer, train_loader, device)
    val_loss = validating(model, criterion, val_loader, device)
    scheduler.step()
    print(f"Epoch {epoch+1} | train loss: {train_loss:.5f} | "
          f"val loss: {val_loss:.5f}")
Epoch 1 | train loss: 0.00371 | val loss: 0.00369 Epoch 2 | train loss: 0.00179 | val loss: 0.00230 Epoch 3 | train loss: 0.00125 | val loss: 0.00107 Epoch 4 | train loss: 0.00119 | val loss: 0.00060 Epoch 5 | train loss: 0.00115 | val loss: 0.00083 Epoch 6 | train loss: 0.00114 | val loss: 0.00095 Epoch 7 | train loss: 0.00095 | val loss: 0.00079 Epoch 8 | train loss: 0.00091 | val loss: 0.00049 Epoch 9 | train loss: 0.00090 | val loss: 0.00216 Epoch 10 | train loss: 0.00092 | val loss: 0.00114 Epoch 11 | train loss: 0.00088 | val loss: 0.00155 Epoch 12 | train loss: 0.00082 | val loss: 0.00139 Epoch 13 | train loss: 0.00073 | val loss: 0.00056 Epoch 14 | train loss: 0.00082 | val loss: 0.00057 Epoch 15 | train loss: 0.00074 | val loss: 0.00087 Epoch 16 | train loss: 0.00072 | val loss: 0.00146 Epoch 17 | train loss: 0.00075 | val loss: 0.00047 Epoch 18 | train loss: 0.00072 | val loss: 0.00100 Epoch 19 | train loss: 0.00072 | val loss: 0.00034 Epoch 20 | train loss: 0.00068 | val loss: 0.00083
# Save a training checkpoint: weights, optimizer state and final losses.
# NOTE(review): the 'model' entry stores a freshly constructed (untrained)
# CNNRegression instance; the trained weights live in 'model_state_dict' —
# confirm this is intentional before relying on the 'model' entry.
torch.save(
    {
        'model': CNNRegression(lag=LAG),
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'train_loss': train_loss,
        'val_loss': val_loss
    },
    f"model_CNN.pt"
)
# Calculate predictions for 2020-01-01 .. 2020-06-30 with a recursive
# one-step-ahead scheme: each prediction is fed back into the lag window.
forecast_period = pd.date_range(data.index[-1] + pd.Timedelta(1, 'd'),
                                datetime(2020, 6, 30))
# Seed the sliding window with the last LAG observed (normalized) values;
# deque(maxlen=LAG) drops the oldest value as each prediction is appended.
input_x = deque(
    scaler.transform(
        data[data.index[-1] - pd.Timedelta(LAG-1, 'd'): data.index[-1]][['Обращения']]
    ).reshape(-1),
    maxlen=LAG
)
predictions = []
# Explicitly switch to inference mode: eval() disables dropout and uses
# BatchNorm running statistics, no_grad() skips autograd bookkeeping.
# (Previously this relied on validating() having left the model in eval
# mode, which was fragile.)
model.eval()
with torch.no_grad():
    for date_ in forecast_period:
        input_day_of_week = date_.weekday()
        input_day_off = 1 if date_.strftime("%m-%d") in days_off else 0
        prediction = model(
            torch.tensor(input_x, dtype=torch.float).unsqueeze(0).to(device),
            torch.tensor(input_day_of_week, dtype=torch.long).unsqueeze(0).to(device),
            torch.tensor(input_day_off, dtype=torch.float).unsqueeze(0).to(device)
        ).item()
        predictions.append(prediction)
        # Recursive forecasting: the prediction becomes the newest lag value
        input_x.append(prediction)
# Undo the MinMax scaling and truncate to whole request counts
predictions = list(
    map(int, scaler.inverse_transform(np.array(predictions).reshape(-1, 1)))
)
# Overlay the neural-net forecast (red) on the historical series (blue)
fig_predictions = go.Figure(fig)
forecast_trace = go.Scatter(
    x=forecast_period,
    y=predictions,
    name=f"Forecast Neural Net",
    mode='lines',
    line=dict(color='red')
)
fig_predictions.add_trace(forecast_trace)
fig_predictions.show()
# Save the forecast into the given workbook: predictions go into column B
# right below the 730 observed rows (i.e. starting at row 732) of sheet
# 'Задание_1'.
book = load_workbook('Задание прогноз.xlsx')
sheet1 = book['Задание_1']
for i in range(len(predictions)):
    sheet1[f"B{732+i}"] = predictions[i]
book.save('Задание прогноз.xlsx')
# Task 2: load the (experience, salary) table from the second sheet
data2 = pd.read_excel(
    'Задание прогноз.xlsx', sheet_name=1, engine='openpyxl', usecols=[0, 1]
)
data2.rename(columns=lambda x: x.strip(), inplace=True)
data2.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 17 entries, 0 to 16 Data columns (total 2 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Стаж 17 non-null float64 1 ЗП 16 non-null float64 dtypes: float64(2) memory usage: 400.0 bytes
# Plot salary against work experience
fig2 = go.Figure()
salary_trace = go.Scatter(
    x=data2['Стаж'],
    y=data2['ЗП'],
    name='Зависимость ЗП от стажа',
    mode='lines',
    line=dict(color='blue')
)
fig2.add_trace(salary_trace)
fig2.update_layout(title=dict(text='Зависимость ЗП от стажа'))
fig2.update_xaxes(title=dict(text='Стаж'))
fig2.update_yaxes(title=dict(text='Объём ЗП'))
fig2.show()
There is a slightly exponential dependency between work experience and salary.
# Drop the last row: its salary cell is empty (16 of 17 non-null above) —
# that is the value we are asked to predict.
work_years = data2['Стаж'].values[:-1]
salary = data2['ЗП'].values[:-1]
# Model curve for fitting: an offset plus an exponential in the squared
# experience value
def f(x, b0, b1, b2):
    """Return b0 + b1 * exp(b2 * x**2), element-wise for array inputs."""
    return b0 + b1 * np.exp(b2 * np.square(x))
# Least-squares fit of (b0, b1, b2), starting from zeros; maxfev bounds the
# number of function evaluations.
popt, pcov = curve_fit(
    f, work_years, salary, p0=[0, 0, 0], maxfev=800
)
# Point forecast: salary after 5 years of experience, rounded to 2 decimals
result_salary = round(f(5.0, *popt), 2)
print(f"Optimal parameters of curve function: {popt}")
print(f"Salary volume in 5 years: {result_salary}")
Optimal parameters of curve function: [ 1.41974713e+02 -1.27628609e+02 -1.54383944e-02] Salary volume in 5 years: 55.21
# Sanity check: reproduce the 5-year forecast from empirical growth rates.
# Percentage change of salary between consecutive observations, plus the
# experience gap between them.
pct = [(salary[i+1] - salary[i]) / salary[i] for i in range(len(work_years)-1)]
interval = [work_years[i+1] - work_years[i] for i in range(len(work_years)-1)]
pct_changes = pd.DataFrame(data={'pct': pct, 'interval': interval})
print(pct_changes.head(5))
# Median growth per step, split by step size (observed steps are 0.2 or
# 0.3 years; 0.25 separates the two groups).
median_02 = np.median(pct_changes[pct_changes['interval'] < 0.25]['pct'])
median_03 = np.median(pct_changes[pct_changes['interval'] > 0.25]['pct'])
# Compound the last observed salary forward to 5 years of experience.
# NOTE(review): the 0.3 + 0.3 + 0.2 + 0.3 step sequence presumably bridges
# the remaining experience gap to 5.0 years — verify against the data.
result_salary_check = round(
    salary[-1] * (1 + median_03) * (1 + median_03) * (1 + median_02) * (1 + median_03),
    2
)
print(f"Salary volume (sanity check): {result_salary_check}")
pct interval 0 -0.150590 0.3 1 0.178922 0.2 2 0.031185 0.3 3 0.094758 0.3 4 0.240025 0.2 Salary volume (sanity check): 55.05
# Evaluate the fit on the observed experience grid
salary_hat = [f(i, *popt) for i in work_years]
# mean_squared_error returns the MSE; take the square root for the RMSE.
# (The original printed the raw MSE under an "RMSE" label; the file-level
# `from math import sqrt` was otherwise unused, confirming the intent.)
print(f"RMSE: {sqrt(mean_squared_error(salary, salary_hat))}")
# Overlay fitted points (green) and the 5-year forecast (red) on fig2
fig2 = go.Figure(fig2)
fit_trace = go.Scatter(
    x=work_years,
    y=salary_hat,
    name='Salary estimation',
    mode='markers',
    line=dict(color='green')
)
fig2.add_trace(fit_trace)
forecast_point = go.Scatter(
    x=[5.0],
    y=[result_salary],
    name='Salary in 5 years',
    mode='markers',
    line=dict(color='red')
)
fig2.add_trace(forecast_point)
fig2.show()
RMSE: 2.3029310448127034
# Save the 5-year salary forecast into cell B18 of sheet 'Задание_2'
book = load_workbook('Задание прогноз.xlsx')
sheet2 = book['Задание_2']
sheet2['B18'] = result_salary
book.save('Задание прогноз.xlsx')